# Supplementary material S2: bioinformatics command lines and parameters used in RNAseq data analysis.


#################################################
# Run the script in project folder
#################################################

################################################
# FastQC
# Quality report for both raw mate files; -t 2 lets FastQC process the two
# files at the same time.
fastqc=~/FastQC
# FastQC does not create the directory given to -o, so make sure it exists.
mkdir -p output
"$fastqc"/fastqc -t 2 data/raw/DM_1.fastq.gz data/raw/DM_2.fastq.gz -o output

# Manually check the reports written to output/ to assess the read quality

#################################################
# Define variables
# Installation paths used by the steps below (all under /home/bioinfo).

# Reference files for the cleaning steps (adapters, artefacts, rRNA k-mers,
# host genome index).
resources=/home/bioinfo/resources

# Directory holding the BBTools scripts (sendsketch.sh, bbduk.sh, ...).
bbtools=/home/bioinfo/bbmap

# Directory holding the Diamond database (nr.dmnd).
db=/home/bioinfo/db

# Directory holding the cap3 executable.
cap3=/home/bioinfo/CAP3

## Sketch
# Run sendsketch.sh (with the `nt` flag) on each raw mate file; the report
# for mate N is written to output/DM_N_nt.txt.
for mate in 1 2; do
  "${bbtools}/sendsketch.sh" \
    in="data/raw/DM_${mate}.fastq.gz" \
    out="output/DM_${mate}_nt.txt" \
    reads=200000 \
    fname="DM_${mate}.fastq.gz" \
    minprob=0.2 samplerate=1.0 merge printname0=f records=20 \
    overwrite=true color=false depth depth2 unique2 volume sortbyvolume \
    contam2=genus nt ow printscore
done

#################################################
# File preparation
### Clumpify
# Single pass (passes=1) over the raw pair; results go to tmp/TEMP1_*.
"${bbtools}/clumpify.sh" \
  pigz=t unpigz=t zl=4 reorder \
  in1=data/raw/DM_1.fastq.gz out1=tmp/TEMP1_1.fastq.gz \
  in2=data/raw/DM_2.fastq.gz out2=tmp/TEMP1_2.fastq.gz \
  passes=1 -Xmx75g

#################################################
## Cleaning
### Step1 - adapter removal
# tmp/TEMP1_* -> tmp/TEMP2_*, right-side k-mer trimming (ktrim=r) against
# adapters.fa; statistics are written to output/ktrim_*.txt.
"${bbtools}/bbduk.sh" -Xmx75G -Xms75G -threads=20 \
  ktrim=r ordered minlen=49 minlenfraction=0.33 mink=11 tbo tpe rcomp=t \
  overwrite=true k=23 hdist=1 hdist2=1 ftm=5 pigz=t unpigz=t zl=4 ow=true \
  in1=tmp/TEMP1_1.fastq.gz out1=tmp/TEMP2_1.fastq.gz \
  in2=tmp/TEMP1_2.fastq.gz out2=tmp/TEMP2_2.fastq.gz \
  rqc=hashmap \
  outduk=output/ktrim_kmerStats1.txt stats=output/ktrim_scaffoldStats1.txt \
  loglog \
  ref="${resources}/adapters.fa"

### Step2 - artefacts removal
# tmp/TEMP2_* -> tmp/TEMP3_*, quality trimming/filtering plus k-mer matching
# against the phiX/adapters, lambda and sequencing-artifact references;
# matching reads are written to output/synth1.fq.gz (outm).
"${bbtools}/bbduk.sh" -Xmx75G -Xms75G -threads=20 \
  maq=10,0 trimq=25 qtrim=r ordered overwrite=true maxns=1 \
  minlen=49 minlenfraction=0.33 k=25 hdist=1 \
  pigz=t unpigz=t zl=6 cf=t barcodefilter=crash ow=true \
  in1=tmp/TEMP2_1.fastq.gz out1=tmp/TEMP3_1.fastq.gz \
  in2=tmp/TEMP2_2.fastq.gz out2=tmp/TEMP3_2.fastq.gz \
  outm=output/synth1.fq.gz \
  rqc=hashmap \
  outduk=output/kmerStats1.txt stats=output/scaffoldStats1.txt \
  loglog \
  ref="${resources}/phix_adapters.fa.gz,${resources}/lambda.fa.gz,${resources}/sequencing_artifacts.fa.gz"

### Step3 - short sequences removal
# tmp/TEMP3_* -> tmp/TEMP4_*, k-mer matching (k=20) against short.fa;
# matching reads are written to output/synth2.fq.gz (outm).
"${bbtools}/bbduk.sh" -Xmx75G -Xms75G -threads=20 \
  ordered overwrite=true k=20 hdist=1 pigz=t unpigz=t zl=6 ow=true \
  in1=tmp/TEMP3_1.fastq.gz out1=tmp/TEMP4_1.fastq.gz \
  in2=tmp/TEMP3_2.fastq.gz out2=tmp/TEMP4_2.fastq.gz \
  outm=output/synth2.fq.gz \
  outduk=output/kmerStats2.txt stats=output/scaffoldStats2.txt \
  loglog \
  ref="${resources}/short.fa"

### Step4 - ribosomal sequences removal
# tmp/TEMP4_* -> tmp/TEMP5_*, k-mer matching (k=31) against ribokmers.fa.gz;
# matching reads are written to output/ribo.fq.gz (outm).
"${bbtools}/bbduk.sh" -Xmx75G -Xms75G -threads=20 \
  ordered k=31 \
  ref="${resources}/ribokmers.fa.gz" \
  ow=true \
  in1=tmp/TEMP4_1.fastq.gz out1=tmp/TEMP5_1.fastq.gz \
  in2=tmp/TEMP4_2.fastq.gz out2=tmp/TEMP5_2.fastq.gz \
  outm=output/ribo.fq.gz \
  outduk=output/ribo_Stats1.txt stats=output/ribo_Stats2.txt

### Step5 - human mouse cat dog sequences removal
# Map tmp/TEMP5_* against the index under $resources/mousecatdoghuman/.
# Mapped reads go to human.fq.gz (outm); unmapped pairs continue as
# tmp/TEMP6_* (outu/outu2). Per-reference statistics go to refStats.txt.
"${bbtools}/bbmap.sh" ordered -Xmx100g k=14 -threads=30 \
  idtag=t usemodulo printunmappedcount ow=true \
  qtrim=rl trimq=10 untrim kfilter=25 maxsites=1 tipsearch=0 minratio=.9 \
  maxindel=3 minhits=2 bw=12 bwr=0.16 fast=true maxsites2=10 \
  outm=human.fq.gz \
  path="${resources}/mousecatdoghuman/" \
  refstats=refStats.txt \
  pigz=t unpigz=t zl=9 \
  in=tmp/TEMP5_1.fastq.gz in2=tmp/TEMP5_2.fastq.gz \
  outu=tmp/TEMP6_1.fastq.gz outu2=tmp/TEMP6_2.fastq.gz

### Step6
# Run bbmerge.sh on the cleaned pair to produce the insert-size histogram
# (ihist) and the cardinality report (outc); no merged-read output is requested.
"$bbtools"/bbmerge.sh -Xmx75G -Xms75G -threads=20 loose overwrite=true \
  in1=tmp/TEMP6_1.fastq.gz in2=tmp/TEMP6_2.fastq.gz \
  ihist=output/ihist_merge.txt outc=output/cardinality.txt \
  pigz=t unpigz=t zl=9 \
  adapters="$resources"/adapters.fa
# Keep a copy of the cleaned pair for the mapping step. The TEMP6 files are
# written to tmp/ (not the project folder), and tmp/TEMP* is removed after Step7.
cp tmp/TEMP6_1.fastq.gz DM_1_map.fastq.gz
cp tmp/TEMP6_2.fastq.gz DM_2_map.fastq.gz

### Step7
# bbnorm.sh with target=100 min=10 on the cleaned pair; the result is the
# final clean read set in data/clean/.
"${bbtools}/bbnorm.sh" -Xmx75G -Xms75G -threads=20 \
  in1=tmp/TEMP6_1.fastq.gz out1=data/clean/DM_1_clean.fastq.gz \
  in2=tmp/TEMP6_2.fastq.gz out2=data/clean/DM_2_clean.fastq.gz \
  target=100 min=10
# The intermediate files are not needed any more.
rm tmp/TEMP*.fastq.gz

#################################################
# Trinity: Trinity-v2.3.2
# Run from the folder where the clean reads are. The subshell confines the
# directory change, so the rest of the script always continues from the
# project folder, even if the cd fails.
(
  cd data/clean || exit 1
  Trinity --seqType fq --max_memory 200G --CPU 20 \
    --left DM_1_clean.fastq.gz --right DM_2_clean.fastq.gz \
    --no_normalize_reads --full_cleanup --verbose
)
# With --full_cleanup and the default output directory (trinity_out_dir),
# Trinity keeps only <output_dir>.Trinity.fasta. Copy it to the project
# folder under the name the CAP3 step reads.
cp data/clean/trinity_out_dir.Trinity.fasta Trinity.fasta

#################################################
# CAP3 assembly
# -o 65: overlap length cutoff; -p 90: overlap percent identity cutoff.
# mkdir -p so a re-run does not fail on the existing directory.
mkdir -p cap3_data
"$cap3"/cap3 Trinity.fasta -o 65 -p 90 > cap3_data/Trinity.fasta.cap3
# CAP3 names its result files after the input file (Trinity.fasta.cap.*),
# so those are the names that have to be moved.
mv Trinity.fasta.cap.ace \
  Trinity.fasta.cap.contigs \
  Trinity.fasta.cap.contigs.links \
  Trinity.fasta.cap.contigs.qual \
  Trinity.fasta.cap.info \
  Trinity.fasta.cap.singlets \
  cap3_data/
# Final assembly = unassembled singlets + CAP3 contigs.
cat cap3_data/Trinity.fasta.cap.singlets cap3_data/Trinity.fasta.cap.contigs > DM_cap3.fasta

#################################################
# Diamond blast
# blastx of the CAP3 assembly against the nr database, keeping the best
# target per query (-k 1), e-value cutoff 1e-5, and not reporting unaligned
# queries (--unal 0). The output format is given once (-f 0); the original
# command repeated it as --outfmt 0, which is the long form of the same option.
diamond blastx -p 10 -d "$db"/nr.dmnd -q DM_cap3.fasta \
  -o DM_cap3-NCBInr.blastDiamond \
  -f 0 -k 1 --max-hsps 2 -e 0.00001 --unal 0 --more-sensitive

# Manual step of virus selection
## Output name => DM_cap3-NCBInr-significant.fasta

#################################################
# Mapping
# Map the un-normalized cleaned reads (DM_*_map.fastq.gz) back onto the
# manually selected sequences and count reads per sequence.
bowtie2-build DM_cap3-NCBInr-significant.fasta seqstomap.index
# --no-discordant must start with two ASCII hyphens (the original had an
# en dash, which bowtie2 does not parse as an option). The trailing "-" makes
# samtools view read the SAM stream from stdin explicitly.
bowtie2 -p 20 -q --no-mixed --no-discordant --no-unal -k 20 \
  -x seqstomap.index -1 DM_1_map.fastq.gz -2 DM_2_map.fastq.gz \
  | samtools view -@ 20 -Sb -o bothPE_bowtie2.bam -
# Sort directly from the BAM file; piping it through "samtools view -b" first
# only re-encodes it.
samtools sort -o bothPE_bowtie2.sorted.bam bothPE_bowtie2.bam
samtools index -b bothPE_bowtie2.sorted.bam
# Per-reference mapped/unmapped read counts.
samtools idxstats bothPE_bowtie2.sorted.bam > counts
